# library preparations
import scipy.io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import joblib
import seaborn as sns
import time
Given that we could just ask the simulation for more samples, I decided to NOT employ a typical train/test split. Rather, I deploy all samples for training and optimize based on cross-validation approaches, giving every record a turn in both the training set and the test set. Once the model is fitted, we can then ask the simulation for more samples (say 1000) to use as a completely independent test set.
Many metrics can be used to evaluate models, some I calculate here are:

Sometimes, in the realm of health sciences, we want to punish or reward the model for doing particularly well or poorly on one kind of error compared to another. For example, in cancer prediction, more emphasis is placed on avoiding False Negatives (not detecting the cancer), so we may wish to assign costs/weights (negative means reward) to TP, FP, TN, and FN like this:

This can be implemented as an alternative to the above metrics during cross validation. Or, the cost matrix can be used to classify one particular record; that is, we can use the cost matrix to evaluate risk.
With a RandomForest, I am able to extract the probability of a sample as showing SAS or not, then say:
P(SAS) = 0.2 P(neutral, other) = 0.8
Given the above cost matrix, then when I:
# Load the pre-generated simulation samples from disk.
load_train = np.load('./data/train.npz', allow_pickle=True)
load_test = np.load('./data/test_large.npz', allow_pickle=True)
X_train, y_train = load_train['X_train'], load_train['y_train']
X_test, y_test = load_test['X_test_large'], load_test['y_test_large']
# Remember the 3-D layout (samples, windows, statistics) before flattening.
samples, rows, cols = X_train.shape
print("Before flattening:")
for arr in (X_train, y_train, X_test, y_test):
    print(arr.shape)
Before flattening: (6500, 460, 12) (6500, 1) (6499, 460, 12) (6499, 1)
# Squeeze the (n, 1) label arrays down to 1-D vectors and collapse each
# sample's (rows, cols) statistic grid into one flat feature vector.
y_train = y_train.reshape(-1)
y_test = y_test.reshape(-1)
X_train = X_train.reshape(len(X_train), -1)
X_test = X_test.reshape(len(X_test), -1)
print("After flattening:")
for arr in (X_train, y_train, X_test, y_test):
    print(arr.shape)
After flattening: (6500, 5520) (6500,) (6499, 5520) (6499,)
# Prepare for K fold cross validation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
# Number of cross-validation folds; random_state fixed for reproducibility.
k = 10
kf = KFold(n_splits=k, shuffle=True, random_state=42)
# Accumulator for the per-fold confusion matrices (with margins).
# TP, FP, FN, TN are read off this table after the loop finishes.
matrix = pd.DataFrame(0,
                      index=['True Yes', 'True No', ''],
                      columns=['Pred Yes', 'Pred No', ''])
start = time.time()
for train_index, test_index in kf.split(X_train):
    # Split the training data into this fold's train/validation parts.
    X_fold_train, X_fold_test = X_train[train_index], X_train[test_index]
    y_fold_train, y_fold_test = y_train[train_index], y_train[test_index]
    # Fresh forest per fold so folds do not leak into each other.
    model = RandomForestClassifier(n_estimators=500, oob_score=True,
                                   max_features='sqrt', n_jobs=8)
    model.fit(X_fold_train, y_fold_train)
    # Predict labels and positive-class probabilities on the held-out fold.
    y_pred = model.predict(X_fold_test)
    y_prob = model.predict_proba(X_fold_test)[:, 1]
    # BUG FIX: pd.crosstab(index, columns) was called as
    # crosstab(y_pred, y_fold_test), putting PREDICTIONS on the rows while
    # the rows were labelled 'True ...' — transposing FN and FP in the
    # aggregated table. Pass the true labels first so rows really are the
    # true classes and the TP/FN/FP/TN read-off after the loop is correct.
    # NOTE(review): the 'Yes'/'No' row names assume the class that sorts
    # first is the positive class — confirm against the label encoding.
    m = pd.crosstab(y_fold_test, y_pred, margins=True)
    m.index = ['True Yes', 'True No', '']
    m.columns = ['Pred Yes', 'Pred No', '']
    matrix += m
end = time.time()
print('Time taken:', end - start)
print(matrix)
# Read the four confusion-matrix cells out of the aggregated table.
# Assumes rows = true class, columns = predicted class.
# NOTE(review): the CV loop above builds its crosstab as
# crosstab(y_pred, y_fold_test) — predictions on the rows — so these
# cell labels may be transposed; verify before trusting the
# precision/recall split below.
TP, FN = matrix.iloc[0, 0], matrix.iloc[0, 1]
FP, TN = matrix.iloc[1, 0], matrix.iloc[1, 1]
total = TP + FN + FP + TN
print("Accuracy: \t\t\t\t\t\t\t", (TP + TN) / total)
print("Error Rate: \t\t\t\t\t\t\t", (FN + FP) / total)
# Recall / sensitivity: fraction of actual positives that were detected.
recall = TP / (TP + FN)
print("True Positive Rate (TPR) | Sensitivity | Recall | Coverage\t", recall)
print("True Negative Rate (TNR) | Specificity \t\t\t\t",
TN / (FP + TN))
print("False Positive Rate (FPR) is \t\t\t\t\t", FP / (FP + TN))
print("False Negative Rate (FNR) is \t\t\t\t\t", FN / (TP + FN))
# Precision: fraction of positive predictions that are correct.
precision = TP / (TP + FP)
print("Average Precision: \t\t\t\t\t\t", precision)
print("Average Recall: \t\t\t\t\t\t", recall)
print("Average F Measures: \t\t\t\t\t\t",
(2*precision*recall) / (precision+recall))
# NOTE(review): `model` here is the forest from the LAST fold only, and it
# is scored on the full X_train — 9/10 of which it was trained on — so this
# AUC is optimistic (0.9996 in the output vs ~0.93 accuracy above). Prefer
# accumulating out-of-fold probabilities inside the CV loop instead.
pred_prob = model.predict_proba(X_train)[:, 1]
print("Average AUC: \t\t\t\t\t\t\t", roc_auc_score(y_train, pred_prob))
Time taken: 192.31781482696533
Pred Yes Pred No
True Yes 2160 132 2292
True No 285 3923 4208
2445 4055 6500
Accuracy: 0.9358461538461539
Error Rate: 0.06415384615384616
True Positive Rate (TPR) | Sensitivity | Recall | Coverage 0.9424083769633508
True Negative Rate (TNR) | Specificity 0.9322718631178707
False Positive Rate (FPR) is 0.06772813688212928
False Negative Rate (FNR) is 0.05759162303664921
Average Precision: 0.8834355828220859
Average Recall: 0.9424083769633508
Average F Measures: 0.9119696010132995
Average AUC: 0.999622219028239
# Fit the final model on all available training samples.
# n_estimators = number of trees in the bagged ensemble; all other
# hyperparameters are left at their scikit-learn defaults, see
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
model = RandomForestClassifier(n_estimators=500, oob_score=True,
                               max_features='sqrt', n_jobs=8)
start = time.time()
model.fit(X_train, y_train)
end = time.time()
print('Time taken:', end - start)
# Persist the fitted forest so it can be reloaded without retraining.
joblib.dump(model, "./data/random_forest_10000.joblib")
Time taken: 20.711103200912476
['./data/random_forest_10000.joblib']
# Human-readable names for the 12 summary statistics computed per window.
features_index = ["pi_x", "pi_y", "Pi_tot", "Fst", "Dxy", "Da", "Tajima's D on X",
                  "Tajima's D on Y", "Tajima's D across all samples", "SNP rel. dens. on X",
                  "SNP rel. dens. on Y", "SNP rel. dens. across all"]
# The forest saw rows*cols flattened inputs; reshape the importances back to
# (rows, cols) and average over windows to get one score per statistic.
features = model.feature_importances_.reshape(rows, cols).mean(axis=0)
# BUG FIX: the original zipped features_index against the raw
# model.feature_importances_, which silently paired the 12 names with only
# the FIRST 12 of the flattened importances (i.e. the first window only),
# not the per-statistic averages. Print the window-averaged scores instead.
for f, imp in zip(features_index, features):
    print(f'Feature {f} importance: {imp} \t\t\t\t\t\t')
# Sorted series used by the bar plot below.
feature_imp = pd.Series(features, index=features_index).sort_values(
    ascending=False)
%matplotlib inline
# Horizontal bar plot of the window-averaged per-statistic importances
# (feature_imp was sorted descending above, so bars appear ranked).
sns.barplot(x=feature_imp, y=feature_imp.index)
# Add labels to your graph
plt.xlabel('Feature Importance Score')
plt.title("Important Features In Detecting SAS")
plt.show()
Feature pi_x importance: 0.0005743048689997335 Feature pi_y importance: 0.0021415111169191493 Feature Pi_tot importance: 0.00035428357120380794 Feature Fst importance: 0.0005739386507464786 Feature Dxy importance: 0.00024654234893850514 Feature Da importance: 0.0005071148001621412 Feature Tajima's D on X importance: 0.00015903623130286053 Feature Tajima's D on Y importance: 0.0007098946306671452 Feature Tajima's D across all samples importance: 0.0006779736977531289 Feature SNP rel. dens. on X importance: 0.007434963139724416 Feature SNP rel. dens. on Y importance: 0.014047141596306439 Feature SNP rel. dens. across all importance: 0.007646971899971477
# Extract a single tree from the ensemble for visualization
# (index 203 is arbitrary; any of the 500 estimators would do).
estimator = model.estimators_[203]
from sklearn.tree import export_graphviz
# Export as dot file. feature_names must cover all rows*cols flattened
# inputs, so the 12 statistic names are tiled once per window — this
# matches the row-major flatten of the (rows, cols) feature grid.
# NOTE(review): export_graphviz assigns class_names in ascending label
# order — confirm ["SAS", "Neutral"] matches the label encoding.
export_graphviz(estimator, out_file='tree.dot',
feature_names = features_index * rows,
class_names = ["SAS", "Neutral"],
rounded = True, proportion = False,
precision = 2, filled = True)
# Convert to png using system command (requires Graphviz)
from subprocess import call
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])
# Display in jupyter notebook
from IPython.display import Image
Image(filename = 'tree.png')